home *** CD-ROM | disk | FTP | other *** search
- page 132,60,1,1
- ;*******************************************
- ;Motorola Austin DSP Operation June 30,1988
- ;*******************************************
- ;DSP96002
- ;Memory to Memory FFT - 1024 point
- ;File name: C-96.asm
- ;**************************************************************************
- ; Maximum sample rate: 1.551 ms at 27.0 MHz
- ; Memory Size: Prog: 137 words ; Data: 5120 words
- ; Number of clock cycles: 41868 (20934 instruction cycles)
- ; Clock Frequency: 27.0MHz
- ; Instruction cycle time: 74.1ns
- ;**************************************************************************
- ;
- ; Complex, Radix 2 Cooley-Tukey Decimation in Time FFT
- ; Untested
- ;
- ; Faster FFT using Programming Tricks found in Typical FORTRAN Libraries
- ;
- ; First two passes combined as a four butterfly loop since
- ; multiplies are trivial.
- ; 2.25 cycles internal (4 cycles external) per Radix 2
- ; butterfly.
- ; Middle passes performed with traditional, triple-nested DO loop.
- ; 4 cycles internal (8 cycles external) per Radix 2 butterfly
- ; plus overhead. Note that a new pipelining technique is
- ; being used to minimize overhead.
- ; Next to last pass performed with double butterfly loop.
- ; 4.5 cycles internal (8.5 cycles external) per Radix 2
- ; butterfly.
- ; Last pass has separate single butterfly loop.
- ; 5 cycles internal (9 cycles external) per Radix 2
- ; butterfly.
- ;
- ; For 1024 complex points, average Radix 2 butterfly = 3.8 cycles
- ; internal and 7.35 cycles external, assuming a single external
- ; data bus.
- ;
- ; Because of separate passes, minimum of 32 points using these
- ; optimizations. Approximately 150 program words required.
- ; Uses internal X and Y Data ROMs for twiddle factor coefficients
- ; for any size FFT up to 1024 complex points.
- ;
- ; Assuming internal program and internal data memory (or two
- ; external data buses), 1024 point complex FFT is 1.57 msec at
- ; 75 nsec instruction rate. Assuming internal program and
- ; external data memory, 1024 point complex FFT is 2.94 msec
- ; at 75 nsec instruction rate.
- ;
- ; First two passes
- ;
- ; 9 cycles internal, 1.77X faster than 4 cycle Radix 2 bfy
- ; 16 cycles external, 2.0X faster than 4 cycle Radix 2 bfy
- ;
- ; r0 = a pointer in & out
- ; r6 = a pointer in
- ; r4 = b pointer in & out
- ; r1 = c pointer in & out
- ; r5 = d pointer in & out
- ; n5 = 2
- ;
- ; normally ordered input data
- ; normally ordered output data
- ; Setup: load the FFT parameters, derive the a/b/c/d data pointers listed above, and set the address modifiers.
- move #points,d1.l ; d1 = points
- move #passes,d9.l ; d9 = passes (copied to d3 before the middle passes)
- move #data,d0.l ; d0 = address of the data buffer
- move #coef,m2 ; m2 = address of the twiddle table; copied to r6 at the start of each later pass
- move #coefsize,d2.l ; d2 = coefsize
-
- lsr d1 d0.l,r0 ; d1 = points/2; r0 = data (a pointer)
- lsr d1 r0,r2 ; d1 = points/4; r2 = data, kept as the base address for the later passes
- add d1,d0 d1.l,d8.l ; d0 = data + points/4; d8 = points/4 (count for the first loop)
- add d1,d0 d0.l,r4 ; r4 = b pointer; d0 advances by points/4. NOTE(review): r4 = data + points/4 only if the move reads d0 before the add writes it - confirm
- add d1,d0 d0.l,r1 ; r1 = c pointer; d0 advances by points/4 again (same assumption)
- lsr d2 d0.l,r5 ; r5 = d pointer (same assumption); d2 = coefsize/2
- lsr d2 r0,r6 ; d2 = coefsize/4; r6 = data (second a pointer, used for input only per the header)
- move #2,n5 ; n5 = 2, the (r5)+n5 step in the first loop
- move d2.l,n6 ; n6 = coefsize/4, the (r6)+n6 step used for the twiddle fetches
- move #-1,m0 ; m0 = -1; presumably linear addressing (m = 0 is commented as bit reversed in the last pass) - confirm
- move m0,m1 ; m1, m4, m5 and m6 get the same modifier as m0
- move m0,m4
- move m0,m5
- move m0,m6
-
- ; Prime the software pipeline for the combined first-two-pass loop below.
- ; d4 carries a Y result for the d element from one iteration to the next, so
- ; it is preloaded from the word just below the first d element; the first
- ; loop store of d4 then presumably just writes that word back.
- fmove x:(r0),d1.s ; d1 = x:(a)
- fmove x:(r1),d0.s ; d0 = x:(c)
- fmove x:(r5)-,d2.s ; d2 = x:(d); r5 = d-1
- fmove y:(r5)+,d4.s ; d4 = y:(d-1); r5 = d again
- faddsub d1,d0 x:(r4),d5.s ; add/sub on x:(a), x:(c); d5 = x:(b)
- faddsub d5,d2 y:(r4),d7.s ; add/sub on x:(b), x:(d); d7 = y:(b)
- ;
- ; Combine first two passes with trivial multiplies.
- ; Each iteration finishes one a/b/c/d quartet and preloads the next one.
- ;
- do d8.l,_twopass ; d8 = points/4 iterations
-
- faddsubr d0,d2 y:(r5),d6.s ; d6 = y:(d)
- faddsub d7,d6 d2.s,x:(r0)+ y:(r6)+,d3.s ; x:(a) = d2, r0 = a+1; d3 = y:(a) read through r6
- faddsubr d1,d7 d0.s,x:(r4) y:(r1)+,d2.s ; x:(b) = d0; d2 = y:(c), r1 = c+1
- faddsub d3,d2 d1.s,x:(r5)- ; x:(d) = d1; r5 = d-1
- faddsubr d2,d6 x:(r0)-,d1.s d4.s,y:(r5)+n5 ; d1 = x of next a, r0 = a; y:(d-1) = d4 (held over from the previous quartet); r5 = d+1
- faddsubr d3,d5 x:(r1)-,d0.s d2.s,y:(r4)+ ; d0 = x of next c, r1 = c; y:(b) = d2, r4 = b+1
- faddsub d1,d0 x:(r5),d2.s d6.s,y:(r0)+ ; d2 = x of next d; y:(a) = d6, r0 = a+1
- ftfr d5,d4 x:(r4),d5.s d3.s,y:(r1) ; d4 = d5, the Y result for d that is stored one iteration later; d5 = x of next b; y:(c) = d3
- faddsub d5,d2 d7.s,x:(r1)+ y:(r4),d7.s ; x:(c) = d7, r1 = c+1; d7 = y of next b
- _twopass
- ; Flush the pipeline: the Y result for the last d element is still in d4.
- ; Every pass through the loop leaves r5 one word past the d element it
- ; handled ((r5)- followed by (r5)+n5 with n5 = 2), so r5 is stepped back
- ; first; storing at y:(r5) directly would skip the last d element and write
- ; one word beyond it.  Costs one extra instruction, executed once.
- ; r5 is reloaded before its next use.
- move (r5)-
- fmove d4.s,y:(r5)+
- ;
- ; Middle passes: passes-4 passes, each a group loop (_end_grp) around a butterfly loop (_end_bfy).
- ; NOTE(review): the header marks this file "Untested"; check the n0/n1/n3 values against the pointer steps in the group tail.
- tfr d9,d3 #4,d0.l ; d3 = passes; d0 = 4 (group count for the first middle pass)
- clr d2 d8.l,d1.l ; d2 = 0; d1 = d8 (points/4 from setup)
- sub d0,d3 d2.l,m6 ; d3 = passes - 4 = number of middle passes; m6 = d2 = 0 (the last pass comments m = 0 as bit reversed addressing)
- do d3.l,_end_pass ; once per middle pass
- move d0.l,n2 ; n2 = group count for this pass
- move r2,r0 ; r0 = start of data (A input pointer)
- lsr d1 m2,r6 ; d1 halved for this pass; r6 = start of the twiddle table
- dec d1 d1.l,n0 ; n0 taken from d1 while d1 is decremented. NOTE(review): whether the move sees d1 before or after the dec decides n0 - confirm
- dec d1 d1.l,n1 ; n1 = one less than n0 (under either ordering)
- move d1.l,n3 ; n3 = d1 after both decrements = inner butterfly loop count
- move n0,n4 ; n4 = n0
- move n0,n5 ; n5 = n0
- lea (r0)+n0,r1 ; r1 = r0 + n0 (B input pointer)
- move r0,r4 ; r4 = A output pointer, same address as the A input
- move r1,r5 ; r5 = B output pointer, same address as the B input
- fmove x:(r6)+n6,d9.s y:(),d8.s ; first twiddle pair -> d9 (X), d8 (Y); r6 += n6. NOTE(review): "y:()" names no address register - listing may be garbled, confirm against the original
- fmove y:(r1),d7.s ; pipeline prime: d7 = y:(B)
- fmpy d8,d7,d3 x:(r1)+,d6.s ; d3 = d8*d7; d6 = x:(B), r1 += 1
- fmpy d9,d6,d0 ; d0 = d9*d6
- fmpy d9,d7,d1 y:(r1),d7.s ; d1 = d9*d7; d7 reloaded with y of the next B
- fmpy d8,d6,d2 fadd d3,d0 x:(r0),d4.s ; d2 = d8*d6; d0 = d0 + d3; d4 = x:(A)
- fmpy d8,d7,d3 faddsubr d4,d0 x:(r1)+,d6.s ; d3 = d8*d7 for the next butterfly; d6 = x of the next B, r1 += 1
-
- do n2,_end_grp ; once per group
-
- do n3,_end_bfy ; butterflies of the group that need no pointer jump or twiddle reload
- fmpy d9,d6,d0 fsub d1,d2 d0.s,x:(r4) y:(r0)+,d5.s ; x:(r4) = d0; d5 = y:(A), r0 += 1
- fmpy d9,d7,d1 faddsubr d5,d2 d4.s,x:(r5) y:(r1),d7.s ; x:(r5) = d4; d7 = y of the next B
- fmpy d8,d6,d2 fadd d3,d0 x:(r0),d4.s d2.s,y:(r5)+ ; d4 = x of the next A; y:(r5) = d2, r5 += 1
- fmpy d8,d7,d3 faddsubr d4,d0 x:(r1)+,d6.s d5.s,y:(r4)+ ; d6 = x of the next B, r1 += 1; y:(r4) = d5, r4 += 1
- _end_bfy
- move (r1)+n1 ; r1 += n1: presumably moves the B prefetch pointer on to the next group - confirm offset
- fmpy d9,d6,d0 fsub d1,d2 d0.s,x:(r4) y:(r0)+,d5.s ; group tail: same four-instruction butterfly step as in the loop
- fmpy d9,d7,d1 faddsubr d5,d2 d4.s,x:(r5) y:(r1),d7.s
- fmpy d8,d6,d2 fadd d3,d0 x:(r0),d4.s d2.s,y:(r5)+
- fmove x:(r6)+n6,d9.s y:(),d8.s ; twiddle pair for the next group; r6 += n6 (same "y:()" caveat as above)
- fmpy d8,d7,d3 faddsubr d4,d0 x:(r1)+,d6.s d5.s,y:(r4)+ ; first multiply that uses the new d8
- fmpy d9,d6,d0 fsub d1,d2 d0.s,x:(r4) y:(r0)+n0,d5.s ; last butterfly step of the group: r0 += n0
- fmpy d9,d7,d1 faddsubr d5,d2 d4.s,x:(r5) y:(r1),d7.s
- fmpy d8,d6,d2 fadd d3,d0 x:(r0),d4.s d2.s,y:(r5)+n5 ; r5 += n5
- fmpy d8,d7,d3 faddsubr d4,d0 x:(r1)+,d6.s d5.s,y:(r4)+n4 ; r4 += n4
- _end_grp
- move n2,d0.l ; d0 = group count again (d0 served as a floating-point temporary above)
- lsl d0 n0,d1.l ; d0 doubled for the next pass; d1 reloaded from n0 (d1 was also a temporary)
- _end_pass
- ;
- ; next to last pass: two butterflies and one twiddle fetch per loop iteration
- ;
- move d0.l,n2 ; n2 = loop count, from d0 as left (doubled) by the last middle pass
- move r2,r0 ; r0 = start of data (A input pointer)
- move r0,r4 ; r4 = A output pointer, same address as the A input
- lea (r0)+2,r1 ; r1 = r0 + 2 (B input pointer)
- move r1,r5 ; r5 = B output pointer, same address as the B input
- move m2,r6 ; r6 = start of the twiddle table
- move #3,n0 ; n0 = 3, the (r0)+n0 step taken after the second butterfly of an iteration
- move n0,n1 ; n1 = 3, step for the B input pointer
- move n0,n4 ; n4 = 3, step for the A output pointer
- move n0,n5 ; n5 = 3, step for the B output pointer
- fmove x:(r6)+n6,d9.s y:(),d8.s ; first twiddle pair -> d9 (X), d8 (Y); r6 += n6. NOTE(review): "y:()" names no address register - listing may be garbled, confirm against the original
- fmove y:(r1),d7.s ; pipeline prime: d7 = y:(B)
- fmpy d8,d7,d3 x:(r1)+,d6.s ; d3 = d8*d7; d6 = x:(B), r1 += 1
- fmpy d9,d6,d0 ; d0 = d9*d6
- fmpy d9,d7,d1 y:(r1),d7.s ; d1 = d9*d7; d7 reloaded with y of the next B
- fmpy d8,d6,d2 fadd d3,d0 x:(r0),d4.s ; d2 = d8*d6; d0 = d0 + d3; d4 = x:(A)
- fmpy d8,d7,d3 faddsubr d4,d0 x:(r1)+n1,d6.s ; d6 = x of the next B; r1 += n1
-
- do n2,_end_next ; n2 iterations of the double butterfly body
- fmpy d9,d6,d0 fsub d1,d2 d0.s,x:(r4) y:(r0)+,d5.s ; first butterfly: x:(r4) = d0; d5 = y:(A), r0 += 1
- fmpy d9,d7,d1 faddsubr d5,d2 d4.s,x:(r5) y:(r1),d7.s ; x:(r5) = d4; d7 = y of the next B
- fmpy d8,d6,d2 fadd d3,d0 x:(r0),d4.s d2.s,y:(r5)+ ; d4 = x of the next A; y:(r5) = d2, r5 += 1
- fmove x:(r6)+n6,d9.s y:(),d8.s ; twiddle pair for the following iteration; r6 += n6 (same "y:()" caveat as above)
- fmpy d8,d7,d3 faddsubr d4,d0 x:(r1)+,d6.s d5.s,y:(r4)+ ; d6 = x of the next B, r1 += 1; y:(r4) = d5, r4 += 1
- fmpy d9,d6,d0 fsub d1,d2 d0.s,x:(r4) y:(r0)+n0,d5.s ; second butterfly: r0 += n0
- fmpy d9,d7,d1 faddsubr d5,d2 d4.s,x:(r5) y:(r1),d7.s
- fmpy d8,d6,d2 fadd d3,d0 x:(r0),d4.s d2.s,y:(r5)+n5 ; r5 += n5
- fmpy d8,d7,d3 faddsubr d4,d0 x:(r1)+n1,d6.s d5.s,y:(r4)+n4 ; r1 += n1; r4 += n4
- _end_next
- ;
- ; last pass: one butterfly and one twiddle fetch per loop iteration; results are written to odata
- ;
- move n2,d0.l ;# previous groups ->d0
- lsl d0 r2,r0 ;update # groups, r0 points to input data A
- move d0.l,n2 ;# groups (doubled) ->n2, the loop count of the do below
- move #odata,r4 ;r4 points to A output
- lea (r0)+,r1 ;r1 points to B input
- move r4,r2 ;r2 points to A output
- move m2,r6 ;r6 points to twiddle factors
- move #2,n0 ;offset is 2 for A input pointer
- move n0,n1 ;offset is 2 for B input pointer
- lea (r2)+n2,r5 ;r5 points to B output. NOTE(review): m2 was loaded with #coef in setup, not -1 - confirm this still yields r2+n2
- move #points/4,n4 ;offset is #points/4 for A output pointer
- move n4,n5 ;offset is #points/4 for B output pointer
- move #0,m4 ;bit reversed addressing for A output pointer
- move m4,m5 ;bit reversed addressing for B output pointer
-
- fmove x:(r6)+n6,d9.s y:(),d8.s ; first twiddle pair -> d9 (X), d8 (Y); r6 += n6. NOTE(review): "y:()" names no address register - listing may be garbled, confirm against the original
- fmove y:(r1),d7.s ; pipeline prime: d7 = y:(B)
- fmpy d8,d7,d3 x:(r1)+n1,d6.s ; d3 = d8*d7; d6 = x:(B), r1 += n1
- fmpy d9,d6,d0 ; d0 = d9*d6
- fmpy d9,d7,d1 y:(r1),d7.s ; d1 = d9*d7; d7 reloaded with y of the next B
- fmpy d8,d6,d2 fadd d3,d0 x:(r0),d4.s ; d2 = d8*d6; d0 = d0 + d3; d4 = x:(A)
- fmove x:(r6)+n6,d9.s y:(),d8.s ; twiddle pair for the second butterfly; r6 += n6 (same "y:()" caveat as above)
- fmpy d8,d7,d3 faddsubr d4,d0 x:(r1)+n1,d6.s ; d3 = d8*d7 with the new d8; d6 = x of the next B, r1 += n1
-
- do n2,_end_last ; n2 iterations, one butterfly each
- fmpy d9,d6,d0 fsub d1,d2 d0.s,x:(r4) y:(r0)+n0,d5.s ; x:(r4) = d0; d5 = y:(A), r0 += n0
- fmpy d9,d7,d1 faddsubr d5,d2 d4.s,x:(r5) y:(r1),d7.s ; x:(r5) = d4; d7 = y of the next B
- fmpy d8,d6,d2 fadd d3,d0 x:(r0),d4.s d2.s,y:(r5)+n5 ; d4 = x of the next A; y:(r5) = d2, r5 += n5
- fmove x:(r6)+n6,d9.s y:(),d8.s ; twiddle pair for the following butterfly; r6 += n6
- fmpy d8,d7,d3 faddsubr d4,d0 x:(r1)+n1,d6.s d5.s,y:(r4)+n4 ; d6 = x of the next B, r1 += n1; y:(r4) = d5, r4 += n4
- _end_last
-
-
-